# -*- coding: utf-8 -*-
# @Date : 2023/3/11 16:57
# @Author : Paul
# @File : clusters.py
# @Description : Base class for clustering algorithms
import pandas as pd
import io
import matplotlib.pyplot as plt
from core.utils.string_utils import StringUtils
from sklearn.impute import SimpleImputer

from core.algo.base_algo import BaseAlgo
from core.data_source.meta_data_source.meta_data_source import MetaDataSource
from core.utils.data_souce_init_utils import DataSourceInitUtil
from core.utils.date_util import DateUtil


class Cluster(BaseAlgo):
    """Base class for clustering algorithms.

    Loads the training data from a configured data source, applies
    per-feature preprocessing (NA handling, categorical encoding, type
    casts), renders a distribution plot of the features, and optionally
    scales the data before it is handed to a concrete clustering model.
    """

    def __init__(self,
                 app_name="clusters",
                 data_source_id=None,
                 table_name=None,
                 feature_cols=None,
                 param=None,
                 ):
        """
        Initialize the clustering base algorithm.

        :param app_name: application name; used in generated image file names
        :param data_source_id: id of the data source holding the training table
        :param table_name: name of the table to read training data from
        :param feature_cols: list of feature column names to select
        :param param: dict of algorithm parameters (keys seen here:
                      "preProcessMethodList", "standardization"); may be None
        """
        # Record start time before any heavy initialization work.
        self.start_time = DateUtil.getCurrentDate()

        super(Cluster, self).__init__(app_name=app_name)
        self.table_name = table_name
        self.feature_cols = feature_cols
        # Concise text summary of the dataset (DataFrame.info output)
        self.info = None
        # Descriptive statistics of the dataset (DataFrame.describe output)
        self.describe = None
        # One timestamp for both image names so the pair always matches
        # (the original called getCurrentDateSimple() twice and could
        # straddle a second boundary).
        stamp = DateUtil.getCurrentDateSimple()
        # Path of the 2-D (or projected) feature distribution image
        self.two_dim_dis_image = "{}two_dim_dis_{}_{}.png".format(
            self.image_path, app_name, stamp)
        # Path of the cluster prediction image
        self.cluster_pred_image = "{}cluster_pred_{}_{}.png".format(
            self.image_path, app_name, stamp)
        # Handle on the metadata database
        self.meta_data_source = MetaDataSource()
        # Data source that holds the training data
        self.data_source = DataSourceInitUtil.getDataBase(self.meta_data_source,
                                                          data_source_id)
        # Dimensionality-reduced training data (set when len(feature_cols) > 3)
        self.train_data_dr = None
        # Normalize to a dict so getModelData() never dereferences None
        # (the original crashed with AttributeError when param was omitted).
        self.param = param if param is not None else {}

    def getModelData(self):
        """
        Build the modeling dataset: load, preprocess, plot and scale.

        :return: tuple (train_data, test_data) -- currently the same object,
                 a DataFrame or (after scaling) a numpy array
        """
        data_query_sql = "select {} from {}".format(",".join(self.feature_cols),
                                                    self.table_name)
        data = self.data_source.queryAll(data_query_sql)
        data = pd.DataFrame(data=data, columns=self.feature_cols)

        # Concise summary of the dataset; DataFrame.info only writes to a
        # buffer, so capture it through StringIO.
        buf = io.StringIO()
        data.info(buf=buf)
        self.info = buf.getvalue()

        # Descriptive statistics
        self.describe = data.describe()

        data = self._applyPreprocessing(data)
        self._plotDistribution(data)
        data = self._scaleFeatures(data)

        return data, data

    def _applyPreprocessing(self, data):
        """Filter invalid preprocessing entries, then apply each one to *data*."""
        # Keep only entries that are real dicts with a non-blank feature name.
        valid_methods = []
        for process_method in self.param.get("preProcessMethodList") or []:
            if process_method is None or process_method == "null":
                continue
            if StringUtils.isBlack(process_method.get("preProcessFeature")):
                continue
            valid_methods.append(process_method)
        # Persist the cleaned list so downstream consumers see only valid entries.
        self.param["preProcessMethodList"] = valid_methods

        # fillna strategy value -> SimpleImputer constructor kwargs
        imputer_kwargs = {
            "mean": {},
            "median": {"strategy": "median"},
            "most_frequent": {"strategy": "most_frequent"},
            "constant_0": {"strategy": "constant", "fill_value": 0},
            "constant_1": {"strategy": "constant", "fill_value": 1},
        }

        for process_method in valid_methods:
            feature = process_method.get("preProcessFeature")
            method = process_method.get("preProcessMethod")
            value = process_method.get("preProcessMethodValue")

            # 1. Drop rows with a missing value in this feature
            if method == "deletena":
                data.dropna(subset=[feature], axis=0, how='any', inplace=True)
            # 2. Impute missing values
            elif method == "fillna":
                if value in imputer_kwargs:
                    imputer = SimpleImputer(**imputer_kwargs[value])
                    data[feature] = imputer.fit_transform(
                        data[feature].values.reshape(-1, 1))
            # 3. Encode a categorical column as the index of its first occurrence
            elif method == "transClassFeature":
                unique_value = data[feature].unique().tolist()
                data[feature] = data[feature].apply(lambda x: unique_value.index(x))
            # 4. Cast the column type
            elif method == "transType":
                if value in ("int", "float"):
                    data[feature] = data[feature].astype(value)
        return data

    def _plotDistribution(self, data):
        """Save a scatter plot of the feature distribution.

        2 features -> 2-D scatter; 3 features -> 3-D scatter; more than 3
        -> PCA projection to 2-D (also stored in self.train_data_dr).
        Fewer than 2 features -> no plot (matches the original behavior).
        """
        n_features = len(self.feature_cols)
        if n_features < 2:
            return
        if n_features == 2:
            fig, ax1 = plt.subplots(1)
            ax1.scatter(data[self.feature_cols[0]],
                        data[self.feature_cols[1]],
                        marker="o",
                        s=15)
        elif n_features == 3:
            ax = plt.subplot(projection='3d')
            ax.set_title('3d_image_show')
            ax.scatter(data[self.feature_cols[0]],
                       data[self.feature_cols[1]],
                       data[self.feature_cols[2]],
                       s=15,
                       marker="o",
                       c='r')
        else:
            from sklearn.decomposition import PCA
            # Project the high-dimensional data onto 2 components for display.
            self.train_data_dr = PCA(n_components=2).fit_transform(data)
            ax = plt.subplot()
            ax.scatter(self.train_data_dr[:, 0],
                       self.train_data_dr[:, 1],
                       s=15,
                       marker="o",
                       c='r')
        plt.savefig(self.two_dim_dis_image, dpi=300)
        # Release the figure; the original leaked one figure per call.
        plt.close()

    def _scaleFeatures(self, data):
        """Apply the configured non-dimensionalization strategy, if any.

        Returns a numpy array when scaling is applied, otherwise *data*
        unchanged. Missing "standardization" key means no scaling (the
        original raised KeyError in that case).
        """
        standardization = self.param.get("standardization")
        if standardization == "MinMaxScaler":
            from sklearn.preprocessing import MinMaxScaler
            data = MinMaxScaler().fit_transform(data)
        elif standardization == "StandardScaler":
            from sklearn.preprocessing import StandardScaler
            data = StandardScaler().fit_transform(data)
        return data


if __name__ == '__main__':
    # Demo run: cluster the (age, income) columns of table "data1".
    demo_cluster = Cluster(
        app_name="cluster_demo",
        data_source_id=9,
        table_name="data1",
        feature_cols=["age", "income"],
    )
    demo_cluster.getModelData()